In [ ]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
import warnings
from sklearn.exceptions import FitFailedWarning
warnings.filterwarnings("ignore", category=FitFailedWarning)


from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from xgboost import XGBClassifier
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.optimizers import Adam
from sklearn.ensemble import BaggingClassifier



from sklearn.ensemble import RandomForestClassifier
# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Your code here

# Reset display options to default
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')
pd.reset_option('display.width')
pd.reset_option('display.max_colwidth')
In [ ]:
df = pd.read_csv(r"C:\Users\Admin\Downloads\archive (24)\water_potability.csv")
In [ ]:
df
Out[ ]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
0 NaN 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0
1 3.716080 129.422921 18630.057858 6.635246 NaN 592.885359 15.180013 56.329076 4.500656 0
2 8.099124 224.236259 19909.541732 9.275884 NaN 418.606213 16.868637 66.420093 3.055934 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0
... ... ... ... ... ... ... ... ... ... ...
3271 4.668102 193.681735 47580.991603 7.166639 359.948574 526.424171 13.894419 66.687695 4.435821 1
3272 7.808856 193.553212 17329.802160 8.061362 NaN 392.449580 19.903225 NaN 2.798243 1
3273 9.419510 175.762646 33155.578218 7.350233 NaN 432.044783 11.039070 69.845400 3.298875 1
3274 5.126763 230.603758 11983.869376 6.303357 NaN 402.883113 11.168946 77.488213 4.708658 1
3275 7.874671 195.102299 17404.177061 7.509306 NaN 327.459760 16.140368 78.698446 2.309149 1

3276 rows × 10 columns

Checking for missing values¶

In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB
In [ ]:
print("Missing total")
print(df.isna().sum())
print('---------')
print('Missing percentages')
print(df.isna().sum() / df.shape[0])
print('---------')
missing = []
columns = df.columns
for i, v in enumerate(df.isna().sum()):
    if v > 0:
        missing.append(columns[i])
print(missing)
Missing total
ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64
---------
Missing percentages
ph                 0.149878
Hardness           0.000000
Solids             0.000000
Chloramines        0.000000
Sulfate            0.238400
Conductivity       0.000000
Organic_carbon     0.000000
Trihalomethanes    0.049451
Turbidity          0.000000
Potability         0.000000
dtype: float64
---------
['ph', 'Sulfate', 'Trihalomethanes']
In [ ]:
df.columns
Out[ ]:
Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
      dtype='object')

Dealing with missing data using KNN imputer¶

In [ ]:
X = df.drop('Potability', axis=1)
imputer = KNNImputer()
X_imputed = imputer.fit_transform(X)
df_ = pd.DataFrame(X_imputed, columns=X.columns)
In [ ]:
y = df['Potability']
df = pd.concat([df_, y], axis=1)
df
Out[ ]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
0 7.156857 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0
1 3.716080 129.422921 18630.057858 6.635246 336.094350 592.885359 15.180013 56.329076 4.500656 0
2 8.099124 224.236259 19909.541732 9.275884 330.449166 418.606213 16.868637 66.420093 3.055934 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0
... ... ... ... ... ... ... ... ... ... ...
3271 4.668102 193.681735 47580.991603 7.166639 359.948574 526.424171 13.894419 66.687695 4.435821 1
3272 7.808856 193.553212 17329.802160 8.061362 364.091541 392.449580 19.903225 64.327280 2.798243 1
3273 9.419510 175.762646 33155.578218 7.350233 327.357588 432.044783 11.039070 69.845400 3.298875 1
3274 5.126763 230.603758 11983.869376 6.303357 325.952434 402.883113 11.168946 77.488213 4.708658 1
3275 7.874671 195.102299 17404.177061 7.509306 345.728295 327.459760 16.140368 78.698446 2.309149 1

3276 rows × 10 columns

Perform basic EDA¶

1. Do the histogram to check the distribution and the relationship between all the feature with the target, using hue too¶

  • After plotting the histogram, i have seen that there are normal distribution of the features, however, there are still slight skewness in some features

  • But i can see imbalance in the data where the number of class 0 is slightly larger than class 1

In [ ]:
for col in df.columns[:-1]:
    plt.figure(figsize=(8, 6))
    sns.histplot(data=df,x=col, hue='Potability', kde=True, alpha=0.7)
    plt.show()
    
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
  1. Checking for skewness The skewness is not drastically, so I will leave it there without transformation process
In [ ]:
print('Feature with high skewness:')
for i, v in enumerate(df.skew()):
    if v > 0.5 or v < -0.5:
        print(columns[i], ':', v)
Feature with high skewness:
Solids : 0.6216344855169127

3. Checking the correlation and relationship between features and the target using pair plot and heatmap¶

Though there are low correlation between the features and the target, and no clear relationship¶

In [ ]:
sns.pairplot(df)
C:\Users\Admin\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x1fb9e911510>
No description has been provided for this image
In [ ]:
correlation = df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation, annot=True, lw=1)
Out[ ]:
<Axes: >
No description has been provided for this image

3. Creating new features¶

  • My strategy is to base on the information given from the dataset of the thresholds of each chemical element that affect the Potability of water
In [ ]:
df.columns
Out[ ]:
Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
      dtype='object')
In [ ]:
df['ph_level'] = pd.cut(df['ph'], bins=[0, 6.9, 7.1, 14], labels=['acidic', 'neutral', 'alkaline'], include_lowest=True)
df['tds_level'] = pd.cut(df['Solids'], bins=[0, 500, 1000, float('inf')], labels=['low', 'moderate', 'high'], include_lowest=True)
df['chloramines_level'] = pd.cut(df['Chloramines'],  bins=[0, 4, float('inf')], labels=['low', 'high'], include_lowest=True)
df['trihalomethanes_level'] = pd.cut(df['Trihalomethanes'], bins=[0, 80, float('inf')], labels=['safe', 'unsafe'], include_lowest=True)

ordinal_mapping = {
    'acidic': 0,
    'neutral': 1,
    'alkaline': 2,
    'low': 0,
    'moderate': 1,
    'high': 2,
    'safe': 0,
    'unsafe': 1,
    'low': 0,
    'high': 1
}

df['ph_level'] = df['ph_level'].map(ordinal_mapping)
df['tds_level'] = df['tds_level'].map(ordinal_mapping)
df['chloramines_level'] = df['chloramines_level'].map(ordinal_mapping)
df['trihalomethanes_level'] = df['trihalomethanes_level'].map(ordinal_mapping)
In [ ]:
df
Out[ ]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability ph_level tds_level chloramines_level trihalomethanes_level
0 7.156857 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0 2 1 1 1
1 3.716080 129.422921 18630.057858 6.635246 336.094350 592.885359 15.180013 56.329076 4.500656 0 0 1 1 0
2 8.099124 224.236259 19909.541732 9.275884 330.449166 418.606213 16.868637 66.420093 3.055934 0 2 1 1 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0 2 1 1 1
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0 2 1 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3271 4.668102 193.681735 47580.991603 7.166639 359.948574 526.424171 13.894419 66.687695 4.435821 1 0 1 1 0
3272 7.808856 193.553212 17329.802160 8.061362 364.091541 392.449580 19.903225 64.327280 2.798243 1 2 1 1 0
3273 9.419510 175.762646 33155.578218 7.350233 327.357588 432.044783 11.039070 69.845400 3.298875 1 2 1 1 0
3274 5.126763 230.603758 11983.869376 6.303357 325.952434 402.883113 11.168946 77.488213 4.708658 1 0 1 1 0
3275 7.874671 195.102299 17404.177061 7.509306 345.728295 327.459760 16.140368 78.698446 2.309149 1 2 1 1 0

3276 rows × 14 columns

In [ ]:
y 
Out[ ]:
0       0
1       0
2       0
3       0
4       0
       ..
3271    1
3272    1
3273    1
3274    1
3275    1
Name: Potability, Length: 3276, dtype: int64
In [ ]:
df
Out[ ]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability ph_level tds_level chloramines_level trihalomethanes_level
0 7.156857 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0 2 1 1 1
1 3.716080 129.422921 18630.057858 6.635246 336.094350 592.885359 15.180013 56.329076 4.500656 0 0 1 1 0
2 8.099124 224.236259 19909.541732 9.275884 330.449166 418.606213 16.868637 66.420093 3.055934 0 2 1 1 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0 2 1 1 1
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0 2 1 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3271 4.668102 193.681735 47580.991603 7.166639 359.948574 526.424171 13.894419 66.687695 4.435821 1 0 1 1 0
3272 7.808856 193.553212 17329.802160 8.061362 364.091541 392.449580 19.903225 64.327280 2.798243 1 2 1 1 0
3273 9.419510 175.762646 33155.578218 7.350233 327.357588 432.044783 11.039070 69.845400 3.298875 1 2 1 1 0
3274 5.126763 230.603758 11983.869376 6.303357 325.952434 402.883113 11.168946 77.488213 4.708658 1 0 1 1 0
3275 7.874671 195.102299 17404.177061 7.509306 345.728295 327.459760 16.140368 78.698446 2.309149 1 2 1 1 0

3276 rows × 14 columns

In [ ]:
correlation = df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation, annot=True, lw=1)
Out[ ]:
<Axes: >
No description has been provided for this image

Dealing with imbalanced data in the target¶

So i did try to use a baseline models to check if fixing imbalanced target can improve the prediction, in this case i use logistic but it seems that i didn't have much improvements, so i will still use the balanced target

In [ ]:
df['Potability'].value_counts()
Out[ ]:
Potability
0    1998
1    1278
Name: count, dtype: int64
In [ ]:
X, y = df.drop('Potability', axis=1), df['Potability']
oversampler = SMOTE(sampling_strategy='auto', random_state=42)
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)

X_resampled, y_resampled = oversampler.fit_resample(X, y)
X_resampled, y_resampled = undersampler.fit_resample(X_resampled, y_resampled)
print(pd.Series(y_resampled).value_counts())
Potability
0    1998
1    1998
Name: count, dtype: int64
In [ ]:
df.Potability.value_counts()
Out[ ]:
Potability
0    1998
1    1278
Name: count, dtype: int64
In [ ]:
df_copy = df.copy()
df_copy['Potability'] = y_resampled
In [ ]:
df_copy
Out[ ]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability ph_level tds_level chloramines_level trihalomethanes_level
0 7.156857 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0 2 1 1 1
1 3.716080 129.422921 18630.057858 6.635246 336.094350 592.885359 15.180013 56.329076 4.500656 0 0 1 1 0
2 8.099124 224.236259 19909.541732 9.275884 330.449166 418.606213 16.868637 66.420093 3.055934 0 2 1 1 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0 2 1 1 1
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0 2 1 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3271 4.668102 193.681735 47580.991603 7.166639 359.948574 526.424171 13.894419 66.687695 4.435821 1 0 1 1 0
3272 7.808856 193.553212 17329.802160 8.061362 364.091541 392.449580 19.903225 64.327280 2.798243 1 2 1 1 0
3273 9.419510 175.762646 33155.578218 7.350233 327.357588 432.044783 11.039070 69.845400 3.298875 1 2 1 1 0
3274 5.126763 230.603758 11983.869376 6.303357 325.952434 402.883113 11.168946 77.488213 4.708658 1 0 1 1 0
3275 7.874671 195.102299 17404.177061 7.509306 345.728295 327.459760 16.140368 78.698446 2.309149 1 2 1 1 0

3276 rows × 14 columns

In [ ]:
correlation_after_resample = df_copy.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_after_resample, annot=True, lw=1)
Out[ ]:
<Axes: >
No description has been provided for this image
In [ ]:
X, y = df.drop('Potability', axis=1), df['Potability']
X_copy, y_copy = df_copy.drop('Potability', axis=1), df_copy['Potability']
X_train, X_test, y_train, y_test =train_test_split(X,y, test_size=0.2, random_state=42)
X_train_copy, X_test_copy, y_train_copy, y_test_copy = train_test_split(X_copy, y_copy, test_size=0.2, random_state=42)

scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)
X_train_copy = scale.fit_transform(X_train_copy)
X_test_copy = scale.transform(X_test_copy)
                                                                       
lr = LogisticRegression()
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs', 'saga'],
    'max_iter': [100, 1000, 10000]
}

grid_base_line_lr = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1, error_score='raise')
grid_base_line_lr.fit(X_train, y_train)
y_pred_lr = grid_base_line_lr.predict(X_test)
print(classification_report(y_test, y_pred_lr))

grid_base_line_lr.fit(X_train_copy, y_train_copy)
y_pred_lr_copy = grid_base_line_lr.predict(X_test_copy)
print(classification_report(y_test_copy, y_pred_lr_copy))
Fitting 5 folds for each of 45 candidates, totalling 225 fits
C:\Users\Admin\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
C:\Users\Admin\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
C:\Users\Admin\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
              precision    recall  f1-score   support

           0       0.63      1.00      0.77       412
           1       0.00      0.00      0.00       244

    accuracy                           0.63       656
   macro avg       0.31      0.50      0.39       656
weighted avg       0.39      0.63      0.48       656

Fitting 5 folds for each of 45 candidates, totalling 225 fits
              precision    recall  f1-score   support

           0       0.63      1.00      0.77       415
           1       0.00      0.00      0.00       241

    accuracy                           0.63       656
   macro avg       0.32      0.50      0.39       656
weighted avg       0.40      0.63      0.49       656

In [ ]:
df
Out[ ]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability ph_level tds_level chloramines_level trihalomethanes_level
0 7.156857 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0 2 1 1 1
1 3.716080 129.422921 18630.057858 6.635246 336.094350 592.885359 15.180013 56.329076 4.500656 0 0 1 1 0
2 8.099124 224.236259 19909.541732 9.275884 330.449166 418.606213 16.868637 66.420093 3.055934 0 2 1 1 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0 2 1 1 1
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0 2 1 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3271 4.668102 193.681735 47580.991603 7.166639 359.948574 526.424171 13.894419 66.687695 4.435821 1 0 1 1 0
3272 7.808856 193.553212 17329.802160 8.061362 364.091541 392.449580 19.903225 64.327280 2.798243 1 2 1 1 0
3273 9.419510 175.762646 33155.578218 7.350233 327.357588 432.044783 11.039070 69.845400 3.298875 1 2 1 1 0
3274 5.126763 230.603758 11983.869376 6.303357 325.952434 402.883113 11.168946 77.488213 4.708658 1 0 1 1 0
3275 7.874671 195.102299 17404.177061 7.509306 345.728295 327.459760 16.140368 78.698446 2.309149 1 2 1 1 0

3276 rows × 14 columns

In [ ]:
df_copy
Out[ ]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability ph_level tds_level chloramines_level trihalomethanes_level
0 7.156857 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0 2 1 1 1
1 3.716080 129.422921 18630.057858 6.635246 336.094350 592.885359 15.180013 56.329076 4.500656 0 0 1 1 0
2 8.099124 224.236259 19909.541732 9.275884 330.449166 418.606213 16.868637 66.420093 3.055934 0 2 1 1 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0 2 1 1 1
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0 2 1 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3271 4.668102 193.681735 47580.991603 7.166639 359.948574 526.424171 13.894419 66.687695 4.435821 1 0 1 1 0
3272 7.808856 193.553212 17329.802160 8.061362 364.091541 392.449580 19.903225 64.327280 2.798243 1 2 1 1 0
3273 9.419510 175.762646 33155.578218 7.350233 327.357588 432.044783 11.039070 69.845400 3.298875 1 2 1 1 0
3274 5.126763 230.603758 11983.869376 6.303357 325.952434 402.883113 11.168946 77.488213 4.708658 1 0 1 1 0
3275 7.874671 195.102299 17404.177061 7.509306 345.728295 327.459760 16.140368 78.698446 2.309149 1 2 1 1 0

3276 rows × 14 columns

5 Checking and dealing outliers using boxplot and zscore¶

I forgot to check for outliers So removing outliers didn't improve the accuracy and the performance in general. I think i need to use different models

In [ ]:
df
Out[ ]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability ph_level tds_level chloramines_level trihalomethanes_level
0 7.156857 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0 2 1 1 1
1 3.716080 129.422921 18630.057858 6.635246 336.094350 592.885359 15.180013 56.329076 4.500656 0 0 1 1 0
2 8.099124 224.236259 19909.541732 9.275884 330.449166 418.606213 16.868637 66.420093 3.055934 0 2 1 1 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0 2 1 1 1
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0 2 1 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3271 4.668102 193.681735 47580.991603 7.166639 359.948574 526.424171 13.894419 66.687695 4.435821 1 0 1 1 0
3272 7.808856 193.553212 17329.802160 8.061362 364.091541 392.449580 19.903225 64.327280 2.798243 1 2 1 1 0
3273 9.419510 175.762646 33155.578218 7.350233 327.357588 432.044783 11.039070 69.845400 3.298875 1 2 1 1 0
3274 5.126763 230.603758 11983.869376 6.303357 325.952434 402.883113 11.168946 77.488213 4.708658 1 0 1 1 0
3275 7.874671 195.102299 17404.177061 7.509306 345.728295 327.459760 16.140368 78.698446 2.309149 1 2 1 1 0

3276 rows × 14 columns

In [ ]:
fig, axs = plt.subplots(7, 2, figsize=(12, 24))
indices = 0
axs = axs.flatten()
for i in df_copy.columns:
    sns.boxplot(df[i], ax=axs[indices])
    axs[indices].set_xlabel(i)  # Set x-label
    indices += 1
plt.tight_layout()  # Adjust the layout
plt.show()
No description has been provided for this image
In [ ]:
def remove_outliers(df, columns):
    for column in columns:
        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5*iqr
        upper = q3 + 1.5*iqr
        df = df[(df[column] >= lower) & (df[column] <= upper )]
    return df

columns = ['ph', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']
df_cleaned = remove_outliers(df, columns)
In [ ]:
df_cleaned
Out[ ]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability ph_level tds_level chloramines_level trihalomethanes_level
0 7.156857 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0 2 1 1 1
1 3.716080 129.422921 18630.057858 6.635246 336.094350 592.885359 15.180013 56.329076 4.500656 0 0 1 1 0
2 8.099124 224.236259 19909.541732 9.275884 330.449166 418.606213 16.868637 66.420093 3.055934 0 2 1 1 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0 2 1 1 1
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0 2 1 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3270 6.069616 186.659040 26138.780191 7.747547 345.700257 415.886955 12.067620 60.419921 3.669712 1 0 1 1 0
3272 7.808856 193.553212 17329.802160 8.061362 364.091541 392.449580 19.903225 64.327280 2.798243 1 2 1 1 0
3273 9.419510 175.762646 33155.578218 7.350233 327.357588 432.044783 11.039070 69.845400 3.298875 1 2 1 1 0
3274 5.126763 230.603758 11983.869376 6.303357 325.952434 402.883113 11.168946 77.488213 4.708658 1 0 1 1 0
3275 7.874671 195.102299 17404.177061 7.509306 345.728295 327.459760 16.140368 78.698446 2.309149 1 2 1 1 0

2908 rows × 14 columns

In [ ]:
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs', 'saga'],
    'max_iter': [100, 1000, 10000]
}
X, y = df_cleaned.drop('Potability', axis=1), df_cleaned['Potability']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)
lr = LogisticRegression()
grid_remove_outliers = GridSearchCV(estimator=lr, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)
grid_remove_outliers.fit(X_train, y_train)
y_pred = grid_remove_outliers.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))
Fitting 5 folds for each of 90 candidates, totalling 450 fits
0.6219931271477663
              precision    recall  f1-score   support

           0       0.62      1.00      0.77       362
           1       0.00      0.00      0.00       220

    accuracy                           0.62       582
   macro avg       0.31      0.50      0.38       582
weighted avg       0.39      0.62      0.48       582

C:\Users\Admin\anaconda3\Lib\site-packages\sklearn\model_selection\_search.py:952: UserWarning: One or more of the test scores are non-finite: [0.6177987         nan 0.6177987  0.6177987  0.6177987  0.6177987
 0.6177987         nan 0.6177987  0.6177987  0.6177987  0.6177987
 0.6177987         nan 0.6177987  0.6177987  0.6177987  0.6177987
 0.6177987         nan 0.6177987  0.61822881 0.61822881 0.61822881
 0.6177987         nan 0.6177987  0.61822881 0.61822881 0.61822881
 0.6177987         nan 0.6177987  0.61822881 0.61822881 0.61822881
 0.6177987         nan 0.6177987  0.61822881 0.61822881 0.61822881
 0.6177987         nan 0.6177987  0.61822881 0.61822881 0.61822881
 0.6177987         nan 0.6177987  0.61822881 0.61822881 0.61822881
 0.61822881        nan 0.61822881 0.61822881 0.61822881 0.61822881
 0.61822881        nan 0.61822881 0.61822881 0.61822881 0.61822881
 0.61822881        nan 0.61822881 0.61822881 0.61822881 0.61822881
 0.61822881        nan 0.61822881 0.61822881 0.61822881 0.61822881
 0.61822881        nan 0.61822881 0.61822881 0.61822881 0.61822881
 0.61822881        nan 0.61822881 0.61822881 0.61822881 0.61822881]
  warnings.warn(
C:\Users\Admin\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
C:\Users\Admin\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
C:\Users\Admin\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
In [ ]:
df
Out[ ]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability ph_level tds_level chloramines_level trihalomethanes_level
0 7.156857 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0 2 1 1 1
1 3.716080 129.422921 18630.057858 6.635246 336.094350 592.885359 15.180013 56.329076 4.500656 0 0 1 1 0
2 8.099124 224.236259 19909.541732 9.275884 330.449166 418.606213 16.868637 66.420093 3.055934 0 2 1 1 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0 2 1 1 1
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0 2 1 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3271 4.668102 193.681735 47580.991603 7.166639 359.948574 526.424171 13.894419 66.687695 4.435821 1 0 1 1 0
3272 7.808856 193.553212 17329.802160 8.061362 364.091541 392.449580 19.903225 64.327280 2.798243 1 2 1 1 0
3273 9.419510 175.762646 33155.578218 7.350233 327.357588 432.044783 11.039070 69.845400 3.298875 1 2 1 1 0
3274 5.126763 230.603758 11983.869376 6.303357 325.952434 402.883113 11.168946 77.488213 4.708658 1 0 1 1 0
3275 7.874671 195.102299 17404.177061 7.509306 345.728295 327.459760 16.140368 78.698446 2.309149 1 2 1 1 0

3276 rows × 14 columns

In [ ]:
xg = XGBClassifier()
param_grid_xg = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 500],
    'max_depth': [3, 5, 7],
    'gamma': [0.0, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}
grid_xg = GridSearchCV(estimator=xg, param_grid=param_grid_xg, cv=5, verbose=2, n_jobs=-1)
grid_xg.fit(X_train, y_train)
y_pred_xg = grid_xg.predict(X_test)
accuracy_xg = accuracy_score(y_test, y_pred_xg)
print(accuracy_xg)
print(classification_report(y_test, y_pred_xg))
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
0.6632302405498282
              precision    recall  f1-score   support

           0       0.66      0.96      0.78       362
           1       0.71      0.18      0.29       220

    accuracy                           0.66       582
   macro avg       0.69      0.57      0.53       582
weighted avg       0.68      0.66      0.59       582

  • So the xgboost classifier model i just used is applied for the dataframe after removing outliers but with imbalanced target. Down here will be xgboost after removing outliers and applying on to balanced taret data
In [ ]:
df.Potability.value_counts()
Out[ ]:
Potability
0    1998
1    1278
Name: count, dtype: int64
In [ ]:
df_remove_balance = pd.concat([X_resampled, y_resampled], axis=1)
In [ ]:
df_remove_balance
Out[ ]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity ph_level tds_level chloramines_level trihalomethanes_level Potability
0 7.156857 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 2 1 1 1 0
1 3.716080 129.422921 18630.057858 6.635246 336.094350 592.885359 15.180013 56.329076 4.500656 0 1 1 0 0
2 8.099124 224.236259 19909.541732 9.275884 330.449166 418.606213 16.868637 66.420093 3.055934 2 1 1 0 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 2 1 1 1 0
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 2 1 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3991 4.959853 215.854869 9887.830755 6.954231 379.504731 527.479694 14.326638 52.388849 3.455045 0 1 1 0 1
3992 7.260849 180.403802 29854.901034 5.836932 352.999150 354.684607 17.914485 43.915782 3.456023 NaN 1 1 NaN 1
3993 7.221689 203.069029 13703.220377 7.823612 352.124567 412.320490 16.125741 80.058877 3.160818 2 1 1 1 1
3994 5.640447 291.665331 16113.257301 7.783664 327.812712 461.506642 18.299992 63.427855 3.941037 NaN 1 1 NaN 1
3995 7.777665 233.761579 16780.116147 6.123297 323.538055 520.285094 17.794741 60.343891 4.683335 2 1 1 0 1

3996 rows × 14 columns

In [ ]:
df_remove_balance.isna().sum()
Out[ ]:
ph                         0
Hardness                   0
Solids                     0
Chloramines                0
Sulfate                    0
Conductivity               0
Organic_carbon             0
Trihalomethanes            0
Turbidity                  0
ph_level                 398
tds_level                  0
chloramines_level         42
trihalomethanes_level    220
Potability                 0
dtype: int64
In [ ]:
X = df_remove_balance.drop('Potability', axis=1)
imputer = KNNImputer()
X_imputed = imputer.fit_transform(X)
df_ = pd.DataFrame(X_imputed, columns=X.columns)
In [ ]:
df_
Out[ ]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity ph_level tds_level chloramines_level trihalomethanes_level
0 7.156857 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 2.0 1.0 1.0 1.0
1 3.716080 129.422921 18630.057858 6.635246 336.094350 592.885359 15.180013 56.329076 4.500656 0.0 1.0 1.0 0.0
2 8.099124 224.236259 19909.541732 9.275884 330.449166 418.606213 16.868637 66.420093 3.055934 2.0 1.0 1.0 0.0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 2.0 1.0 1.0 1.0
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 2.0 1.0 1.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
3991 4.959853 215.854869 9887.830755 6.954231 379.504731 527.479694 14.326638 52.388849 3.455045 0.0 1.0 1.0 0.0
3992 7.260849 180.403802 29854.901034 5.836932 352.999150 354.684607 17.914485 43.915782 3.456023 1.6 1.0 1.0 0.2
3993 7.221689 203.069029 13703.220377 7.823612 352.124567 412.320490 16.125741 80.058877 3.160818 2.0 1.0 1.0 1.0
3994 5.640447 291.665331 16113.257301 7.783664 327.812712 461.506642 18.299992 63.427855 3.941037 1.0 1.0 1.0 0.0
3995 7.777665 233.761579 16780.116147 6.123297 323.538055 520.285094 17.794741 60.343891 4.683335 2.0 1.0 1.0 0.0

3996 rows × 13 columns

In [ ]:
df_remove_balance = pd.concat([df_, y_resampled], axis=1)
In [ ]:
df_remove_balance
Out[ ]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity ph_level tds_level chloramines_level trihalomethanes_level Potability
0 7.156857 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 2.0 1.0 1.0 1.0 0
1 3.716080 129.422921 18630.057858 6.635246 336.094350 592.885359 15.180013 56.329076 4.500656 0.0 1.0 1.0 0.0 0
2 8.099124 224.236259 19909.541732 9.275884 330.449166 418.606213 16.868637 66.420093 3.055934 2.0 1.0 1.0 0.0 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 2.0 1.0 1.0 1.0 0
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 2.0 1.0 1.0 0.0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3991 4.959853 215.854869 9887.830755 6.954231 379.504731 527.479694 14.326638 52.388849 3.455045 0.0 1.0 1.0 0.0 1
3992 7.260849 180.403802 29854.901034 5.836932 352.999150 354.684607 17.914485 43.915782 3.456023 1.6 1.0 1.0 0.2 1
3993 7.221689 203.069029 13703.220377 7.823612 352.124567 412.320490 16.125741 80.058877 3.160818 2.0 1.0 1.0 1.0 1
3994 5.640447 291.665331 16113.257301 7.783664 327.812712 461.506642 18.299992 63.427855 3.941037 1.0 1.0 1.0 0.0 1
3995 7.777665 233.761579 16780.116147 6.123297 323.538055 520.285094 17.794741 60.343891 4.683335 2.0 1.0 1.0 0.0 1

3996 rows × 14 columns

In [ ]:
X, y = df_remove_balance.drop('Potability', axis=1), df_remove_balance['Potability']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
xg = XGBClassifier()
param_grid_xg = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 500],
    'max_depth': [3, 5, 7],
    'gamma': [0.0, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}
grid_xg = GridSearchCV(estimator=xg, param_grid=param_grid_xg, cv=5, verbose=2, n_jobs=-1)
grid_xg.fit(X_train, y_train)
y_pred_xg = grid_xg.predict(X_test)
accuracy_xg = accuracy_score(y_test, y_pred_xg)
print(accuracy_xg)
print(classification_report(y_test, y_pred_xg))
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
0.69
              precision    recall  f1-score   support

           0       0.69      0.73      0.71       415
           1       0.69      0.65      0.67       385

    accuracy                           0.69       800
   macro avg       0.69      0.69      0.69       800
weighted avg       0.69      0.69      0.69       800

In [ ]:
X, y = df_remove_balance.drop('Potability', axis=1), df_remove_balance['Potability']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)
xg = XGBClassifier()
param_grid_xg = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 500],
    'max_depth': [3, 5, 7],
    'gamma': [0.0, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}
grid_xg = GridSearchCV(estimator=xg, param_grid=param_grid_xg, cv=5, verbose=2, n_jobs=-1)
grid_xg.fit(X_train, y_train)
y_pred_xg = grid_xg.predict(X_test)
accuracy_xg = accuracy_score(y_test, y_pred_xg)
print(accuracy_xg)
print(classification_report(y_test, y_pred_xg))
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
0.69625
              precision    recall  f1-score   support

           0       0.70      0.73      0.71       415
           1       0.69      0.66      0.68       385

    accuracy                           0.70       800
   macro avg       0.70      0.70      0.70       800
weighted avg       0.70      0.70      0.70       800

I have removed outliers, balanced the target and using XGB to check if those methods are optimal. After that i will use KBest and PCA then check for other models and do hyperparameters tuning¶

One mistake that i made is that i didn't scale the features, but the accuracy increase as well as the recall in the class 1 to 0.69 and 0.65 respectively¶

And after removing outliers and balancing the target, the accuracy incrased to 70%. I will try to do KBest and PCA with other models now¶

In [ ]:
df_remove_balance
Out[ ]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity ph_level tds_level chloramines_level trihalomethanes_level Potability
0 7.156857 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 2.0 1.0 1.0 1.0 0
1 3.716080 129.422921 18630.057858 6.635246 336.094350 592.885359 15.180013 56.329076 4.500656 0.0 1.0 1.0 0.0 0
2 8.099124 224.236259 19909.541732 9.275884 330.449166 418.606213 16.868637 66.420093 3.055934 2.0 1.0 1.0 0.0 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 2.0 1.0 1.0 1.0 0
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 2.0 1.0 1.0 0.0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3991 4.959853 215.854869 9887.830755 6.954231 379.504731 527.479694 14.326638 52.388849 3.455045 0.0 1.0 1.0 0.0 1
3992 7.260849 180.403802 29854.901034 5.836932 352.999150 354.684607 17.914485 43.915782 3.456023 1.6 1.0 1.0 0.2 1
3993 7.221689 203.069029 13703.220377 7.823612 352.124567 412.320490 16.125741 80.058877 3.160818 2.0 1.0 1.0 1.0 1
3994 5.640447 291.665331 16113.257301 7.783664 327.812712 461.506642 18.299992 63.427855 3.941037 1.0 1.0 1.0 0.0 1
3995 7.777665 233.761579 16780.116147 6.123297 323.538055 520.285094 17.794741 60.343891 4.683335 2.0 1.0 1.0 0.0 1

3996 rows × 14 columns

In [ ]:
df_remove_balance.Potability.value_counts()
Out[ ]:
Potability
0    1998
1    1998
Name: count, dtype: int64
In [ ]:
X, y = df_remove_balance.drop('Potability', axis=1), df_remove_balance['Potability']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

k = 10
In [ ]:
selector = SelectKBest(score_func=f_classif, k=k)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

selected_features_indices = selector.get_support(indices=True)

selected_features = X.columns[selected_features_indices]
print('Selected features: ', selected_features)
Selected features:  Index(['Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'ph_level', 'tds_level', 'chloramines_level',
       'trihalomethanes_level'],
      dtype='object')
In [ ]:
xg = XGBClassifier()
param_grid_xg = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 500],
    'max_depth': [3, 5, 7],
    'gamma': [0.0, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}
grid_xg = GridSearchCV(estimator=xg, param_grid=param_grid_xg, cv=5, verbose=2, n_jobs=-1)
grid_xg.fit(X_train_selected, y_train)
y_pred_kbest = grid_xg.predict(X_test_selected)
accuracy_xg_kbest = accuracy_score(y_test, y_pred_kbest)
print(accuracy_xg_kbest)
print(classification_report(y_test, y_pred_kbest))
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
0.6875
              precision    recall  f1-score   support

           0       0.68      0.76      0.72       415
           1       0.70      0.61      0.65       385

    accuracy                           0.69       800
   macro avg       0.69      0.68      0.68       800
weighted avg       0.69      0.69      0.69       800

So i did use KBest, first i tried k=5 which droped the score down to 63 % which is shitty, then i increased the k to 10 and the score increase as well to 69% which is not notable nor good compared to previous score¶

I will try other models: Random Forest¶

In [ ]:
rfc = RandomForestClassifier()
param_grid_rfc = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

grid_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid_rfc, cv=5, verbose=2, n_jobs=-1)
grid_rfc.fit(X_train_selected, y_train)
y_pred_kbest = grid_rfc.predict(X_test_selected)
accuracy_rfc_kbest = accuracy_score(y_test, y_pred_kbest)
print(accuracy_rfc_kbest)
print(classification_report(y_test, y_pred_kbest))
Fitting 5 folds for each of 81 candidates, totalling 405 fits
0.70625
              precision    recall  f1-score   support

           0       0.71      0.74      0.72       415
           1       0.70      0.67      0.69       385

    accuracy                           0.71       800
   macro avg       0.71      0.70      0.71       800
weighted avg       0.71      0.71      0.71       800

In [ ]:
X, y = df_remove_balance.drop('Potability', axis=1), df_remove_balance['Potability']
X_train, X_test, y_train, y_test =train_test_split(X,y, test_size=0.2, random_state=42)
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)
rfc = RandomForestClassifier()
param_grid_rfc = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

grid_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid_rfc, cv=5, verbose=2, n_jobs=-1)
grid_rfc.fit(X_train, y_train)
y_pred = grid_rfc.predict(X_test)
accuracy_rfc = accuracy_score(y_test, y_pred)
print(accuracy_rfc)
print(classification_report(y_test, y_pred))
Fitting 5 folds for each of 81 candidates, totalling 405 fits
0.73375
              precision    recall  f1-score   support

           0       0.73      0.76      0.75       415
           1       0.73      0.70      0.72       385

    accuracy                           0.73       800
   macro avg       0.73      0.73      0.73       800
weighted avg       0.73      0.73      0.73       800

So i used random forest without Kbest, removed outliers, balanced the target, and the final for today is 72%¶

In [ ]:
df_remove_balance
Out[ ]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity ph_level tds_level chloramines_level trihalomethanes_level Potability
0 7.156857 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 2.0 1.0 1.0 1.0 0
1 3.716080 129.422921 18630.057858 6.635246 336.094350 592.885359 15.180013 56.329076 4.500656 0.0 1.0 1.0 0.0 0
2 8.099124 224.236259 19909.541732 9.275884 330.449166 418.606213 16.868637 66.420093 3.055934 2.0 1.0 1.0 0.0 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 2.0 1.0 1.0 1.0 0
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 2.0 1.0 1.0 0.0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3991 4.959853 215.854869 9887.830755 6.954231 379.504731 527.479694 14.326638 52.388849 3.455045 0.0 1.0 1.0 0.0 1
3992 7.260849 180.403802 29854.901034 5.836932 352.999150 354.684607 17.914485 43.915782 3.456023 1.6 1.0 1.0 0.2 1
3993 7.221689 203.069029 13703.220377 7.823612 352.124567 412.320490 16.125741 80.058877 3.160818 2.0 1.0 1.0 1.0 1
3994 5.640447 291.665331 16113.257301 7.783664 327.812712 461.506642 18.299992 63.427855 3.941037 1.0 1.0 1.0 0.0 1
3995 7.777665 233.761579 16780.116147 6.123297 323.538055 520.285094 17.794741 60.343891 4.683335 2.0 1.0 1.0 0.0 1

3996 rows × 14 columns

In [ ]:
df_remove_balance.skew()
Out[ ]:
ph                        0.034198
Hardness                 -0.016929
Solids                    0.615021
Chloramines              -0.027961
Sulfate                  -0.058976
Conductivity              0.276901
Organic_carbon            0.017890
Trihalomethanes          -0.078986
Turbidity                -0.012541
ph_level                 -0.084981
tds_level               -63.213923
chloramines_level        -6.246821
trihalomethanes_level     1.766519
Potability                0.000000
dtype: float64
In [ ]:
for col in df_remove_balance.columns:
    plt.figure(figsize=(12, 8))
    sns.histplot(df_remove_balance[col])
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
df_remove_balance.tds_level.value_counts()
Out[ ]:
tds_level
1.0    3995
0.0       1
Name: count, dtype: int64
In [ ]:
df_remove_balance.drop('tds_level', axis=1, inplace=True)
In [ ]:
X, y = df_remove_balance.drop('Potability', axis=1), df_remove_balance['Potability']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test  = scale.transform(X_test)
rfc = RandomForestClassifier(random_state=42)
param_grid_rfc = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

grid_search_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid_rfc, verbose=2, n_jobs=-1)
grid_search_rfc.fit(X_train, y_train)
y_pred = grid_search_rfc.predict(X_test)
accur = accuracy_score(y_test, y_pred)
print('Accuracy score: ', accur)
print(classification_report(y_test, y_pred))
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Accuracy score:  0.72125
              precision    recall  f1-score   support

           0       0.72      0.76      0.74       415
           1       0.72      0.68      0.70       385

    accuracy                           0.72       800
   macro avg       0.72      0.72      0.72       800
weighted avg       0.72      0.72      0.72       800

In [ ]:
from tensorflow.keras.layers import Dropout
X = df_cleaned.drop('Potability', axis=1)
y = df_cleaned['Potability']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1], )),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)
C:\Users\Admin\anaconda3\Lib\site-packages\keras\src\layers\core\dense.py:87: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
Epoch 1/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 14s 16ms/step - accuracy: 0.5062 - loss: 0.7854 - val_accuracy: 0.6030 - val_loss: 0.6770
Epoch 2/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - accuracy: 0.5597 - loss: 0.7183 - val_accuracy: 0.6030 - val_loss: 0.6709
Epoch 3/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.5974 - loss: 0.6865 - val_accuracy: 0.6009 - val_loss: 0.6707
Epoch 4/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.5787 - loss: 0.6804 - val_accuracy: 0.6030 - val_loss: 0.6675
Epoch 5/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.5838 - loss: 0.6858 - val_accuracy: 0.6030 - val_loss: 0.6673
Epoch 6/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 1s 9ms/step - accuracy: 0.6314 - loss: 0.6613 - val_accuracy: 0.6009 - val_loss: 0.6673
Epoch 7/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step - accuracy: 0.6198 - loss: 0.6665 - val_accuracy: 0.6009 - val_loss: 0.6663
Epoch 8/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - accuracy: 0.6111 - loss: 0.6733 - val_accuracy: 0.6009 - val_loss: 0.6655
Epoch 9/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step - accuracy: 0.6109 - loss: 0.6688 - val_accuracy: 0.6009 - val_loss: 0.6653
Epoch 10/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 1s 13ms/step - accuracy: 0.6229 - loss: 0.6556 - val_accuracy: 0.6030 - val_loss: 0.6644
Epoch 11/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - accuracy: 0.6267 - loss: 0.6553 - val_accuracy: 0.6073 - val_loss: 0.6632
Epoch 12/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - accuracy: 0.6362 - loss: 0.6572 - val_accuracy: 0.6052 - val_loss: 0.6619
Epoch 13/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - accuracy: 0.6129 - loss: 0.6556 - val_accuracy: 0.6094 - val_loss: 0.6594
Epoch 14/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.6191 - loss: 0.6611 - val_accuracy: 0.6137 - val_loss: 0.6572
Epoch 15/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6420 - loss: 0.6460 - val_accuracy: 0.6159 - val_loss: 0.6564
Epoch 16/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6338 - loss: 0.6514 - val_accuracy: 0.6223 - val_loss: 0.6546
Epoch 17/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.6597 - loss: 0.6376 - val_accuracy: 0.6309 - val_loss: 0.6534
Epoch 18/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.6441 - loss: 0.6521 - val_accuracy: 0.6309 - val_loss: 0.6542
Epoch 19/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.6327 - loss: 0.6490 - val_accuracy: 0.6330 - val_loss: 0.6520
Epoch 20/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6352 - loss: 0.6397 - val_accuracy: 0.6330 - val_loss: 0.6489
Epoch 21/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6524 - loss: 0.6411 - val_accuracy: 0.6330 - val_loss: 0.6476
Epoch 22/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6425 - loss: 0.6444 - val_accuracy: 0.6245 - val_loss: 0.6464
Epoch 23/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6647 - loss: 0.6266 - val_accuracy: 0.6352 - val_loss: 0.6454
Epoch 24/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6538 - loss: 0.6327 - val_accuracy: 0.6373 - val_loss: 0.6443
Epoch 25/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6640 - loss: 0.6283 - val_accuracy: 0.6438 - val_loss: 0.6418
Epoch 26/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.6633 - loss: 0.6311 - val_accuracy: 0.6438 - val_loss: 0.6409
Epoch 27/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6402 - loss: 0.6452 - val_accuracy: 0.6416 - val_loss: 0.6421
Epoch 28/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6558 - loss: 0.6235 - val_accuracy: 0.6373 - val_loss: 0.6396
Epoch 29/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6681 - loss: 0.6338 - val_accuracy: 0.6373 - val_loss: 0.6403
Epoch 30/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - accuracy: 0.6532 - loss: 0.6339 - val_accuracy: 0.6416 - val_loss: 0.6414
Epoch 31/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 1s 8ms/step - accuracy: 0.6407 - loss: 0.6295 - val_accuracy: 0.6395 - val_loss: 0.6404
Epoch 32/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.6604 - loss: 0.6210 - val_accuracy: 0.6373 - val_loss: 0.6405
Epoch 33/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6541 - loss: 0.6285 - val_accuracy: 0.6373 - val_loss: 0.6383
Epoch 34/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6704 - loss: 0.6262 - val_accuracy: 0.6330 - val_loss: 0.6367
Epoch 35/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6630 - loss: 0.6238 - val_accuracy: 0.6309 - val_loss: 0.6351
Epoch 36/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6494 - loss: 0.6327 - val_accuracy: 0.6352 - val_loss: 0.6362
Epoch 37/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6581 - loss: 0.6226 - val_accuracy: 0.6309 - val_loss: 0.6364
Epoch 38/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6693 - loss: 0.6143 - val_accuracy: 0.6330 - val_loss: 0.6355
Epoch 39/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - accuracy: 0.6566 - loss: 0.6296 - val_accuracy: 0.6395 - val_loss: 0.6343
Epoch 40/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6593 - loss: 0.6181 - val_accuracy: 0.6309 - val_loss: 0.6338
Epoch 41/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6753 - loss: 0.6152 - val_accuracy: 0.6309 - val_loss: 0.6334
Epoch 42/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.6635 - loss: 0.6210 - val_accuracy: 0.6438 - val_loss: 0.6343
Epoch 43/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6696 - loss: 0.6157 - val_accuracy: 0.6395 - val_loss: 0.6359
Epoch 44/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.6718 - loss: 0.6161 - val_accuracy: 0.6352 - val_loss: 0.6347
Epoch 45/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6743 - loss: 0.6098 - val_accuracy: 0.6373 - val_loss: 0.6338
Epoch 46/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6772 - loss: 0.6156 - val_accuracy: 0.6373 - val_loss: 0.6346
Epoch 47/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6732 - loss: 0.6192 - val_accuracy: 0.6395 - val_loss: 0.6343
Epoch 48/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - accuracy: 0.6729 - loss: 0.6095 - val_accuracy: 0.6416 - val_loss: 0.6330
Epoch 49/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.6596 - loss: 0.6246 - val_accuracy: 0.6438 - val_loss: 0.6328
Epoch 50/50
59/59 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.6575 - loss: 0.6375 - val_accuracy: 0.6352 - val_loss: 0.6340
19/19 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.6652 - loss: 0.6157 
Test Loss: 0.6122020483016968
Test Accuracy: 0.6752577424049377
In [ ]:
df_cleaned
Out[ ]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability ph_level tds_level chloramines_level trihalomethanes_level
0 7.156857 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0 2 1 1 1
1 3.716080 129.422921 18630.057858 6.635246 336.094350 592.885359 15.180013 56.329076 4.500656 0 0 1 1 0
2 8.099124 224.236259 19909.541732 9.275884 330.449166 418.606213 16.868637 66.420093 3.055934 0 2 1 1 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0 2 1 1 1
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0 2 1 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3270 6.069616 186.659040 26138.780191 7.747547 345.700257 415.886955 12.067620 60.419921 3.669712 1 0 1 1 0
3272 7.808856 193.553212 17329.802160 8.061362 364.091541 392.449580 19.903225 64.327280 2.798243 1 2 1 1 0
3273 9.419510 175.762646 33155.578218 7.350233 327.357588 432.044783 11.039070 69.845400 3.298875 1 2 1 1 0
3274 5.126763 230.603758 11983.869376 6.303357 325.952434 402.883113 11.168946 77.488213 4.708658 1 0 1 1 0
3275 7.874671 195.102299 17404.177061 7.509306 345.728295 327.459760 16.140368 78.698446 2.309149 1 2 1 1 0

2908 rows × 14 columns

I would like to use bagging to check for improvement¶

In [ ]: